package WebPAC::Input::Gutenberg;

use warnings;
use strict;

use WebPAC::Input;
use base qw/WebPAC::Common/;
use XML::LibXML;
use Data::Dump qw/dump/;
use Encode qw/encode_utf8/;

=head1 NAME

WebPAC::Input::Gutenberg - support for RDF catalog data from Project Gutenberg

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';


=head1 SYNOPSIS

Read catalog data from Project Gutemberg (uncompressed!) and create
pseudo-MARC records from them.

 my $ll_db = new WebPAC::Input::Gutenberg(
	path => '/path/to/catalog.rdf',
 );

=head1 FUNCTIONS

=head2 new

Returns new low-level input API object

  my $ll_db = new WebPAC::Input::Gutenberg(
  	path => '/path/to/catalog.rdf'
	filter => sub {
		my ($l,$field_nr) = @_;
		# do something with $l which is line of input file
		return $l;
	},
  }

Options:

=over 4

=item path

path to Project Gutenberg RDF catalog file

=back

=cut

sub new {
	my $class = shift;
	my $self = {@_};
	bless($self, $class);

	my $arg = {@_};

	my $log = $self->_get_logger();

	$log->info("opening Project Gutenberg RDF catalog '$arg->{path}'");

	my $parser = XML::LibXML->new ();
	$parser->keep_blanks (0);
	my $doc = $parser->parse_file( $arg->{path} );

	$log->info("parsing over, finding book nodes");
	my $booknodes = $doc->findnodes ('/rdf:RDF/pgterms:etext');

	$log->logdie("can't find any book nodes in RDF '$arg->{path}'") unless ($booknodes->size > 0);

	my $mapping = [
		[ 'dc:title//text()',		'200', 'a' ],
		[ 'dc:creator//text()',		'700', 'a' ],
		[ 'dc:alternative//text()',	'740', 'a' ],
		[ 'dc:subject//text()',		'650', 'a' ],
		[ 'dc:contributor//text()',	'700', 'a' ],
		[ 'dc:created//text()',		'533', 'd' ],
		[ 'dc:description//text()',	'500', 'a' ],
		[ 'dc:language//text()',	'041', 'a' ],
	];

	$log->info("found ", $booknodes->size, " book nodes, processing");

	my $mfn = 1;

	foreach my $booknode (@$booknodes) {

		# this is a book description node
		my $etext_no = $booknode->getAttribute ('ID');
		$etext_no =~ s/^etext//;

		my $row = {
			'001' => [ $etext_no ],
		};

		foreach my $m ( @$mapping ) {
			my ($xpath,$f,$sf) = @$m;

			foreach my $v ($booknode->findnodes($xpath)) {
				push @{ $row->{$f} }, '^' . $sf . encode_utf8( $v->textContent );
			}

			$log->debug("using $xpath to fill $f^$sf ==> ", dump( $row->{$f} )) if (defined( $row->{$f} ));
		}

		$self->{_rows}->{ $mfn } = $row;
		$log->debug("created row $mfn ", dump( $row ));

		$mfn++;
	}
	$booknodes = undef; # release some memory

	$self->{size} = $mfn - 1;

	$log->info("created ", $self->{size}, " records for ", $arg->{path});

	$self ? return $self : return undef;
}

=head2 fetch_rec

Return record with ID C<$mfn> from database

  my $rec = $ll_db->fetch_rec( $mfn, $filter_coderef );

=cut

sub fetch_rec {
	my $self = shift;

	my ($mfn, $filter_coderef) = @_;

	my $rec = $self->_to_hash(
		mfn => $mfn,
		row => $self->{_rows}->{$mfn},
		hash_filter => $filter_coderef,
	);

	my $log = $self->_get_logger();
	$log->debug("fetch_rec($mfn) = ", dump($rec));

	return $rec;
}

=head2 size

Return number of records in database

  my $size = $ll_db->size;

=cut

sub size {
	my $self = shift;
	return $self->{size};
}

=head2 _to_hash

Return hash from row. Taken from L<Biblio::Isis>

  my $rec = $ll_db->_to_hash(
  	mfn => $mfn;
  	$row
  );

=cut

sub _to_hash {
	my $self = shift;

	my $arg = {@_};

	my $log = $self->_get_logger();

	my $hash_filter = $arg->{hash_filter};
	my $mfn = $arg->{mfn} || $log->logconfess("need mfn in arguments");
	my $row = $arg->{row} || $log->logconfess("need row in arguments");

	# init record to include MFN as field 000
	my $rec = { '000' => [ $mfn ] };

	foreach my $f_nr (keys %{$row}) {
		foreach my $l (@{$row->{$f_nr}}) {

			# filter output
			$l = $hash_filter->($l, $f_nr) if ($hash_filter);
			next unless defined($l);

			my $val;
			my $r_sf;	# repeatable subfields in this record

			# has subfields?
			if ($l =~ m/\^/) {
				foreach my $t (split(/\^/,$l)) {
					next if (! $t);
					my ($sf,$v) = (substr($t,0,1), substr($t,1));
					next unless (defined($v) && $v ne '');

					if (ref( $val->{$sf} ) eq 'ARRAY') {

						push @{ $val->{$sf} }, $v;

						# record repeatable subfield it it's offset
						push @{ $val->{subfields} }, ( $sf, $#{ $val->{$sf} } );
						$r_sf->{$sf}++;

					} elsif (defined( $val->{$sf} )) {

						# convert scalar field to array
						$val->{$sf} = [ $val->{$sf}, $v ];

						push @{ $val->{subfields} }, ( $sf, 1 );
						$r_sf->{$sf}++;

					} else {
						$val->{$sf} = $v;
						push @{ $val->{subfields} }, ( $sf, 0 );
					}
				}
			} else {
				$val = $l;
			}

			push @{$rec->{$f_nr}}, $val;
		}
	}

	return $rec;
}

=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2006 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Input::Gutenberg
